import plotly.express as px
from IPython.core.display import HTML
HTML("<script src='https://cdn.plot.ly/plotly-latest.min.js'></script>")
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
df = pd.read_csv('hdb-price-with-features-2013to2023.csv').iloc[:, 1:]
df.head()
| month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | lease_commence_date | remaining_lease | ... | mall_count_within_1km | mall_list_within_1km | school_shortest_dist | school_shortest_dist_name | good_sch_count_within_1km | good_sch_list_within_1km | good_sch_score | good_sch_score_sap_gep | distance_to_cbd_km | matured | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-03-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 22 TO 24 | 103.0 | Premium Apartment | 1998 | 74.333333 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| 1 | 2022-12-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 22 TO 24 | 102.0 | Premium Apartment | 1998 | 74.583333 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| 2 | 2022-11-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 16 TO 18 | 102.0 | Premium Apartment | 1998 | 74.666667 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| 3 | 2022-06-01 | BISHAN | 5 ROOM | 289 | BISHAN ST 24 | 07 TO 09 | 121.0 | Premium Apartment | 1998 | 75.083333 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| 4 | 2021-12-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 07 TO 09 | 102.0 | Premium Apartment | 1998 | 75.583333 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
5 rows × 30 columns
df
| month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | lease_commence_date | remaining_lease | ... | mall_count_within_1km | mall_list_within_1km | school_shortest_dist | school_shortest_dist_name | good_sch_count_within_1km | good_sch_list_within_1km | good_sch_score | good_sch_score_sap_gep | distance_to_cbd_km | matured | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-03-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 22 TO 24 | 103.0 | Premium Apartment | 1998 | 74.333333 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| 1 | 2022-12-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 22 TO 24 | 102.0 | Premium Apartment | 1998 | 74.583333 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| 2 | 2022-11-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 16 TO 18 | 102.0 | Premium Apartment | 1998 | 74.666667 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| 3 | 2022-06-01 | BISHAN | 5 ROOM | 289 | BISHAN ST 24 | 07 TO 09 | 121.0 | Premium Apartment | 1998 | 75.083333 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| 4 | 2021-12-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 07 TO 09 | 102.0 | Premium Apartment | 1998 | 75.583333 | ... | 1 | ['Junction 8'] | 0.214393 | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 217186 | 2013-01-01 | QUEENSTOWN | 3 ROOM | 26 | TANGLIN HALT RD | 07 TO 09 | 56.0 | Standard | 1973 | 59.000000 | ... | 0 | [] | 0.173506 | New Town Primary School | 0 | [] | 0 | 0 | 5.895374 | 1 |
| 217187 | 2013-04-01 | QUEENSTOWN | 3 ROOM | 64 | C'WEALTH DR | 10 TO 12 | 56.0 | Standard | 1969 | 55.000000 | ... | 0 | [] | 0.321240 | New Town Primary School | 0 | [] | 0 | 0 | 6.199401 | 1 |
| 217188 | 2013-04-01 | ANG MO KIO | 3 ROOM | 532 | ANG MO KIO AVE 10 | 01 TO 03 | 69.0 | New Generation | 1980 | 66.000000 | ... | 3 | ['Jubilee Square', 'AMK Hub', 'Broadway Plaza'] | 0.324541 | Jing Shan Primary School | 0 | [] | 0 | 0 | 9.498989 | 1 |
| 217189 | 2013-02-01 | BEDOK | EXECUTIVE | 10C | BEDOK STH AVE 2 | 01 TO 03 | 142.0 | Apartment | 1995 | 81.000000 | ... | 1 | ['Djitsun Mall Bedok'] | 0.237382 | Bedok Green Primary School | 0 | [] | 0 | 0 | 10.127127 | 1 |
| 217190 | 2013-01-01 | CENTRAL AREA | 3 ROOM | 269C | QUEEN ST | 04 TO 06 | 67.0 | New Generation | 1984 | 70.000000 | ... | 11 | ['Bugis+', 'Sim Lim Square', 'Bugis Junction',... | 0.526720 | Stamford Primary School | 1 | ['Tao Nan School'] | 2 | 2 | 1.428059 | 0 |
217191 rows × 30 columns
cpi = pd.read_csv('monthly cpi.csv')
cpi = cpi.set_index('Data Series')
cpi_data = pd.DataFrame(cpi.columns, columns=['month'])
cpi_data['CPI'] = pd.to_numeric(np.array(cpi.loc['All Items']))
cpi_data = cpi_data.iloc[1: , :]
cpi_data['month'] = pd.to_datetime(cpi_data['month'])
cpi_data.head()
| month | CPI | |
|---|---|---|
| 1 | 2023-01-01 | 111.397 |
| 2 | 2022-12-01 | 111.186 |
| 3 | 2022-11-01 | 110.959 |
| 4 | 2022-10-01 | 109.893 |
| 5 | 2022-09-01 | 110.339 |
cpi_data.dtypes
month datetime64[ns] CPI float64 dtype: object
df['month'] = pd.to_datetime(df['month'])
df = pd.merge(df, cpi_data, how = 'left')
df['adjusted_price'] = (df['resale_price'] / df['CPI'].fillna(100))*100
df['year'] = pd.DatetimeIndex(df['month']).year
df.head()
| month | town | flat_type | block | street_name | storey_range | floor_area_sqm | flat_model | lease_commence_date | remaining_lease | ... | school_shortest_dist_name | good_sch_count_within_1km | good_sch_list_within_1km | good_sch_score | good_sch_score_sap_gep | distance_to_cbd_km | matured | CPI | adjusted_price | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2023-03-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 22 TO 24 | 103.0 | Premium Apartment | 1998 | 74.333333 | ... | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 | NaN | 750388.000000 | 2023 |
| 1 | 2022-12-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 22 TO 24 | 102.0 | Premium Apartment | 1998 | 74.583333 | ... | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 | 111.186 | 764484.737287 | 2022 |
| 2 | 2022-11-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 16 TO 18 | 102.0 | Premium Apartment | 1998 | 74.666667 | ... | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 | 110.959 | 683135.212105 | 2022 |
| 3 | 2022-06-01 | BISHAN | 5 ROOM | 289 | BISHAN ST 24 | 07 TO 09 | 121.0 | Premium Apartment | 1998 | 75.083333 | ... | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 | 108.671 | 809783.658934 | 2022 |
| 4 | 2021-12-01 | BISHAN | 4 ROOM | 289 | BISHAN ST 24 | 07 TO 09 | 102.0 | Premium Apartment | 1998 | 75.583333 | ... | Catholic High School | 1 | ['Catholic High School'] | 3 | 2 | 7.635584 | 1 | 104.439 | 579285.515947 | 2021 |
5 rows × 33 columns
df.isna().sum()
month 0 town 0 flat_type 0 block 0 street_name 0 storey_range 0 floor_area_sqm 0 flat_model 0 lease_commence_date 0 remaining_lease 0 resale_price 0 address 0 latitude 0 longitude 0 shortest_dist_mrt_name 0 shortest_dist_mrt 0 mrt_count_within_1km 0 mrt_list_within_1km 0 shortest_dist_mall_name 0 shortest_dist_mall 0 mall_count_within_1km 0 mall_list_within_1km 0 school_shortest_dist 0 school_shortest_dist_name 0 good_sch_count_within_1km 0 good_sch_list_within_1km 0 good_sch_score 0 good_sch_score_sap_gep 0 distance_to_cbd_km 0 matured 0 CPI 2407 adjusted_price 0 year 0 dtype: int64
df['CPI'].dropna()
1 111.186
2 110.959
3 108.671
4 104.439
5 100.469
...
217186 98.121
217187 97.112
217188 97.112
217189 99.107
217190 98.121
Name: CPI, Length: 214784, dtype: float64
df.columns
Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
'floor_area_sqm', 'flat_model', 'lease_commence_date',
'remaining_lease', 'resale_price', 'address', 'latitude', 'longitude',
'shortest_dist_mrt_name', 'shortest_dist_mrt', 'mrt_count_within_1km',
'mrt_list_within_1km', 'shortest_dist_mall_name', 'shortest_dist_mall',
'mall_count_within_1km', 'mall_list_within_1km', 'school_shortest_dist',
'school_shortest_dist_name', 'good_sch_count_within_1km',
'good_sch_list_within_1km', 'good_sch_score', 'good_sch_score_sap_gep',
'distance_to_cbd_km', 'matured', 'CPI', 'adjusted_price', 'year'],
dtype='object')
data = df.drop(['month','block', 'street_name', 'lease_commence_date',
'resale_price','address', 'latitude', 'longitude',
'shortest_dist_mrt_name', 'mrt_list_within_1km',
'shortest_dist_mall_name','mall_list_within_1km',
'school_shortest_dist_name', 'good_sch_list_within_1km', 'CPI'],
axis = 1)
data.head()
| town | flat_type | storey_range | floor_area_sqm | flat_model | remaining_lease | shortest_dist_mrt | mrt_count_within_1km | shortest_dist_mall | mall_count_within_1km | school_shortest_dist | good_sch_count_within_1km | good_sch_score | good_sch_score_sap_gep | distance_to_cbd_km | matured | adjusted_price | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BISHAN | 4 ROOM | 22 TO 24 | 103.0 | Premium Apartment | 74.333333 | 0.754215 | 1 | 0.824707 | 1 | 0.214393 | 1 | 3 | 2 | 7.635584 | 1 | 750388.000000 | 2023 |
| 1 | BISHAN | 4 ROOM | 22 TO 24 | 102.0 | Premium Apartment | 74.583333 | 0.754215 | 1 | 0.824707 | 1 | 0.214393 | 1 | 3 | 2 | 7.635584 | 1 | 764484.737287 | 2022 |
| 2 | BISHAN | 4 ROOM | 16 TO 18 | 102.0 | Premium Apartment | 74.666667 | 0.754215 | 1 | 0.824707 | 1 | 0.214393 | 1 | 3 | 2 | 7.635584 | 1 | 683135.212105 | 2022 |
| 3 | BISHAN | 5 ROOM | 07 TO 09 | 121.0 | Premium Apartment | 75.083333 | 0.754215 | 1 | 0.824707 | 1 | 0.214393 | 1 | 3 | 2 | 7.635584 | 1 | 809783.658934 | 2022 |
| 4 | BISHAN | 4 ROOM | 07 TO 09 | 102.0 | Premium Apartment | 75.583333 | 0.754215 | 1 | 0.824707 | 1 | 0.214393 | 1 | 3 | 2 | 7.635584 | 1 | 579285.515947 | 2021 |
# label encoding for Storey Range
# The higher the storey_range, the higher the label encoding
le = LabelEncoder()
data = data.sort_values(by='storey_range')
data['storey_range'] = le.fit_transform(data['storey_range'])
data
| town | flat_type | storey_range | floor_area_sqm | flat_model | remaining_lease | shortest_dist_mrt | mrt_count_within_1km | shortest_dist_mall | mall_count_within_1km | school_shortest_dist | good_sch_count_within_1km | good_sch_score | good_sch_score_sap_gep | distance_to_cbd_km | matured | adjusted_price | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 149500 | YISHUN | 4 ROOM | 0 | 104.0 | Model A | 70.000000 | 1.121243 | 0 | 0.892542 | 2 | 0.165144 | 0 | 0 | 0 | 15.229937 | 0 | 3.523260e+05 | 2017 |
| 103026 | BEDOK | 5 ROOM | 0 | 121.0 | Improved | 62.666667 | 0.654595 | 2 | 1.937973 | 0 | 0.218787 | 0 | 0 | 0 | 8.307092 | 1 | 4.286070e+05 | 2019 |
| 39255 | TOA PAYOH | 3 ROOM | 0 | 68.0 | Improved | 47.250000 | 0.985550 | 1 | 1.454343 | 0 | 0.141342 | 1 | 1 | 1 | 5.664920 | 1 | 2.770357e+05 | 2021 |
| 103022 | BEDOK | 5 ROOM | 0 | 121.0 | Improved | 60.083333 | 0.654595 | 2 | 1.937973 | 0 | 0.218787 | 0 | 0 | 0 | 8.307092 | 1 | 5.025270e+05 | 2022 |
| 103019 | BEDOK | 5 ROOM | 0 | 137.0 | Model A-Maisonette | 70.000000 | 1.503240 | 0 | 1.173192 | 0 | 0.533189 | 1 | 1 | 1 | 10.065695 | 1 | 6.289691e+05 | 2013 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6448 | CENTRAL AREA | 4 ROOM | 16 | 94.0 | Type S1 | 88.250000 | 0.504834 | 2 | 0.125445 | 3 | 0.146162 | 0 | 0 | 0 | 1.803958 | 0 | 1.097620e+06 | 2021 |
| 32175 | CENTRAL AREA | 4 ROOM | 16 | 94.0 | Type S1 | 87.666667 | 0.300452 | 4 | 0.267698 | 4 | 0.303171 | 0 | 0 | 0 | 1.610956 | 0 | 1.141285e+06 | 2022 |
| 6513 | CENTRAL AREA | 4 ROOM | 16 | 94.0 | Type S1 | 94.000000 | 0.504834 | 2 | 0.264875 | 2 | 0.146162 | 0 | 0 | 0 | 1.803958 | 0 | 9.208849e+05 | 2015 |
| 129341 | CENTRAL AREA | 5 ROOM | 16 | 105.0 | Type S2 | 89.250000 | 0.264816 | 4 | 0.295279 | 4 | 0.284877 | 0 | 0 | 0 | 1.642962 | 0 | 1.246691e+06 | 2020 |
| 6683 | CENTRAL AREA | 4 ROOM | 16 | 94.0 | Type S1 | 92.833333 | 0.390244 | 3 | 0.178959 | 3 | 0.183819 | 0 | 0 | 0 | 1.723052 | 0 | 9.620044e+05 | 2017 |
217191 rows × 18 columns
data['flat_type'].value_counts()
4 ROOM 89434 3 ROOM 54630 5 ROOM 53198 EXECUTIVE 16709 2 ROOM 3035 1 ROOM 98 MULTI-GENERATION 87 Name: flat_type, dtype: int64
# remove flat_types with less than 1000 instances
flat_types_counts = data['flat_type'].value_counts()
data = data[~data['flat_type'].isin(flat_types_counts[flat_types_counts < 1000].index)]
data['flat_type'].value_counts()
4 ROOM 89434 3 ROOM 54630 5 ROOM 53198 EXECUTIVE 16709 2 ROOM 3035 Name: flat_type, dtype: int64
1 Room and Multi-Generation flats are removed.
# label encoding for Flat Type
data = data.sort_values(by='flat_type')
data
| town | flat_type | storey_range | floor_area_sqm | flat_model | remaining_lease | shortest_dist_mrt | mrt_count_within_1km | shortest_dist_mall | mall_count_within_1km | school_shortest_dist | good_sch_count_within_1km | good_sch_score | good_sch_score_sap_gep | distance_to_cbd_km | matured | adjusted_price | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 139670 | SENGKANG | 2 ROOM | 3 | 47.0 | Model A | 92.500000 | 1.679652 | 0 | 0.612240 | 2 | 0.615462 | 1 | 1 | 0 | 12.972290 | 0 | 274890.723652 | 2021 |
| 23425 | JURONG EAST | 2 ROOM | 4 | 47.0 | Model A | 92.750000 | 1.198840 | 0 | 1.162130 | 0 | 1.548739 | 0 | 0 | 0 | 13.106423 | 0 | 228981.034397 | 2019 |
| 216836 | GEYLANG | 2 ROOM | 2 | 50.0 | Standard | 54.000000 | 0.441429 | 2 | 2.102417 | 0 | 0.728915 | 2 | 2 | 0 | 6.181252 | 1 | 281600.720898 | 2013 |
| 160100 | BUKIT MERAH | 2 ROOM | 0 | 49.0 | Standard | 56.000000 | 0.456153 | 1 | 0.483975 | 2 | 0.710211 | 0 | 0 | 0 | 2.309575 | 1 | 295792.054134 | 2013 |
| 23446 | JURONG EAST | 2 ROOM | 4 | 47.0 | Model A | 94.333333 | 1.198840 | 0 | 1.162130 | 0 | 1.548739 | 0 | 0 | 0 | 13.106423 | 0 | 232149.138234 | 2018 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 207515 | BISHAN | EXECUTIVE | 1 | 149.0 | Maisonette | 71.833333 | 0.556969 | 1 | 0.680325 | 1 | 0.412845 | 1 | 3 | 2 | 7.564648 | 1 | 861558.235513 | 2019 |
| 207514 | BISHAN | EXECUTIVE | 1 | 149.0 | Maisonette | 71.250000 | 0.556969 | 1 | 0.680325 | 1 | 0.412845 | 1 | 3 | 2 | 7.564648 | 1 | 911141.271806 | 2020 |
| 207512 | BISHAN | EXECUTIVE | 1 | 146.0 | Maisonette | 70.750000 | 0.556969 | 1 | 0.680325 | 1 | 0.412845 | 1 | 3 | 2 | 7.564648 | 1 | 829601.130181 | 2020 |
| 114433 | WOODLANDS | EXECUTIVE | 1 | 189.0 | Apartment | 75.666667 | 0.508104 | 1 | 0.352256 | 4 | 0.356366 | 0 | 0 | 0 | 18.098446 | 0 | 723610.817982 | 2018 |
| 51028 | SENGKANG | EXECUTIVE | 0 | 140.0 | Maisonette | 79.333333 | 0.872715 | 1 | 0.421567 | 1 | 0.560152 | 0 | 0 | 0 | 11.729111 | 0 | 557172.974601 | 2018 |
217006 rows × 18 columns
data['flat_type'] = le.fit_transform(data['flat_type'])
data
| town | flat_type | storey_range | floor_area_sqm | flat_model | remaining_lease | shortest_dist_mrt | mrt_count_within_1km | shortest_dist_mall | mall_count_within_1km | school_shortest_dist | good_sch_count_within_1km | good_sch_score | good_sch_score_sap_gep | distance_to_cbd_km | matured | adjusted_price | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 139670 | SENGKANG | 0 | 3 | 47.0 | Model A | 92.500000 | 1.679652 | 0 | 0.612240 | 2 | 0.615462 | 1 | 1 | 0 | 12.972290 | 0 | 274890.723652 | 2021 |
| 23425 | JURONG EAST | 0 | 4 | 47.0 | Model A | 92.750000 | 1.198840 | 0 | 1.162130 | 0 | 1.548739 | 0 | 0 | 0 | 13.106423 | 0 | 228981.034397 | 2019 |
| 216836 | GEYLANG | 0 | 2 | 50.0 | Standard | 54.000000 | 0.441429 | 2 | 2.102417 | 0 | 0.728915 | 2 | 2 | 0 | 6.181252 | 1 | 281600.720898 | 2013 |
| 160100 | BUKIT MERAH | 0 | 0 | 49.0 | Standard | 56.000000 | 0.456153 | 1 | 0.483975 | 2 | 0.710211 | 0 | 0 | 0 | 2.309575 | 1 | 295792.054134 | 2013 |
| 23446 | JURONG EAST | 0 | 4 | 47.0 | Model A | 94.333333 | 1.198840 | 0 | 1.162130 | 0 | 1.548739 | 0 | 0 | 0 | 13.106423 | 0 | 232149.138234 | 2018 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 207515 | BISHAN | 4 | 1 | 149.0 | Maisonette | 71.833333 | 0.556969 | 1 | 0.680325 | 1 | 0.412845 | 1 | 3 | 2 | 7.564648 | 1 | 861558.235513 | 2019 |
| 207514 | BISHAN | 4 | 1 | 149.0 | Maisonette | 71.250000 | 0.556969 | 1 | 0.680325 | 1 | 0.412845 | 1 | 3 | 2 | 7.564648 | 1 | 911141.271806 | 2020 |
| 207512 | BISHAN | 4 | 1 | 146.0 | Maisonette | 70.750000 | 0.556969 | 1 | 0.680325 | 1 | 0.412845 | 1 | 3 | 2 | 7.564648 | 1 | 829601.130181 | 2020 |
| 114433 | WOODLANDS | 4 | 1 | 189.0 | Apartment | 75.666667 | 0.508104 | 1 | 0.352256 | 4 | 0.356366 | 0 | 0 | 0 | 18.098446 | 0 | 723610.817982 | 2018 |
| 51028 | SENGKANG | 4 | 0 | 140.0 | Maisonette | 79.333333 | 0.872715 | 1 | 0.421567 | 1 | 0.560152 | 0 | 0 | 0 | 11.729111 | 0 | 557172.974601 | 2018 |
217006 rows × 18 columns
data['flat_model'].value_counts()
Model A 69776 Improved 54604 New Generation 31053 Premium Apartment 23264 Simplified 9544 Apartment 8413 Standard 6703 Maisonette 6246 DBSS 2943 Model A2 2782 Type S1 401 Model A-Maisonette 386 Adjoined flat 362 Type S2 206 Terrace 132 Premium Apartment Loft 96 2-room 42 Improved-Maisonette 29 Premium Maisonette 17 3Gen 7 Name: flat_model, dtype: int64
# Re-categorize flat models into broader categories
replace_values = {'Model A-Maisonette':'Maisonette', 'Improved-Maisonette':'Maisonette',
'Premium Maisonette':'Maisonette',}
data = data.replace({'flat_model': replace_values})
data['flat_model'].value_counts()
Model A 69776 Improved 54604 New Generation 31053 Premium Apartment 23264 Simplified 9544 Apartment 8413 Standard 6703 Maisonette 6678 DBSS 2943 Model A2 2782 Type S1 401 Adjoined flat 362 Type S2 206 Terrace 132 Premium Apartment Loft 96 2-room 42 3Gen 7 Name: flat_model, dtype: int64
# remove flat models with less than 1000 instances
flat_model_counts = data['flat_model'].value_counts()
data = data[~data['flat_model'].isin(flat_model_counts[flat_model_counts < 1000].index)]
data['flat_model'].value_counts()
Model A 69776 Improved 54604 New Generation 31053 Premium Apartment 23264 Simplified 9544 Apartment 8413 Standard 6703 Maisonette 6678 DBSS 2943 Model A2 2782 Name: flat_model, dtype: int64
data['town'].sort_values().unique()
array(['ANG MO KIO', 'BEDOK', 'BISHAN', 'BUKIT BATOK', 'BUKIT MERAH',
'BUKIT PANJANG', 'BUKIT TIMAH', 'CENTRAL AREA', 'CHOA CHU KANG',
'CLEMENTI', 'GEYLANG', 'HOUGANG', 'JURONG EAST', 'JURONG WEST',
'KALLANG/WHAMPOA', 'MARINE PARADE', 'PASIR RIS', 'PUNGGOL',
'QUEENSTOWN', 'SEMBAWANG', 'SENGKANG', 'SERANGOON', 'TAMPINES',
'TOA PAYOH', 'WOODLANDS', 'YISHUN'], dtype=object)
data['flat_model'].sort_values().unique()
array(['Apartment', 'DBSS', 'Improved', 'Maisonette', 'Model A',
'Model A2', 'New Generation', 'Premium Apartment', 'Simplified',
'Standard'], dtype=object)
data = pd.get_dummies(data, columns = ['town','flat_model'],drop_first = True)
data.sort_index(inplace = True)
data.columns
Index(['flat_type', 'storey_range', 'floor_area_sqm', 'remaining_lease',
'shortest_dist_mrt', 'mrt_count_within_1km', 'shortest_dist_mall',
'mall_count_within_1km', 'school_shortest_dist',
'good_sch_count_within_1km', 'good_sch_score', 'good_sch_score_sap_gep',
'distance_to_cbd_km', 'matured', 'adjusted_price', 'year', 'town_BEDOK',
'town_BISHAN', 'town_BUKIT BATOK', 'town_BUKIT MERAH',
'town_BUKIT PANJANG', 'town_BUKIT TIMAH', 'town_CENTRAL AREA',
'town_CHOA CHU KANG', 'town_CLEMENTI', 'town_GEYLANG', 'town_HOUGANG',
'town_JURONG EAST', 'town_JURONG WEST', 'town_KALLANG/WHAMPOA',
'town_MARINE PARADE', 'town_PASIR RIS', 'town_PUNGGOL',
'town_QUEENSTOWN', 'town_SEMBAWANG', 'town_SENGKANG', 'town_SERANGOON',
'town_TAMPINES', 'town_TOA PAYOH', 'town_WOODLANDS', 'town_YISHUN',
'flat_model_DBSS', 'flat_model_Improved', 'flat_model_Maisonette',
'flat_model_Model A', 'flat_model_Model A2',
'flat_model_New Generation', 'flat_model_Premium Apartment',
'flat_model_Simplified', 'flat_model_Standard'],
dtype='object')
In order to account for multicollinearity, we drop one of the dummy variables.
For the town dummies, Ang Mo Kio is dropped. For the flat_model dummies, Apartment is dropped.
data.dtypes
flat_type int32 storey_range int32 floor_area_sqm float64 remaining_lease float64 shortest_dist_mrt float64 mrt_count_within_1km int64 shortest_dist_mall float64 mall_count_within_1km int64 school_shortest_dist float64 good_sch_count_within_1km int64 good_sch_score int64 good_sch_score_sap_gep int64 distance_to_cbd_km float64 matured int64 adjusted_price float64 year int64 town_BEDOK uint8 town_BISHAN uint8 town_BUKIT BATOK uint8 town_BUKIT MERAH uint8 town_BUKIT PANJANG uint8 town_BUKIT TIMAH uint8 town_CENTRAL AREA uint8 town_CHOA CHU KANG uint8 town_CLEMENTI uint8 town_GEYLANG uint8 town_HOUGANG uint8 town_JURONG EAST uint8 town_JURONG WEST uint8 town_KALLANG/WHAMPOA uint8 town_MARINE PARADE uint8 town_PASIR RIS uint8 town_PUNGGOL uint8 town_QUEENSTOWN uint8 town_SEMBAWANG uint8 town_SENGKANG uint8 town_SERANGOON uint8 town_TAMPINES uint8 town_TOA PAYOH uint8 town_WOODLANDS uint8 town_YISHUN uint8 flat_model_DBSS uint8 flat_model_Improved uint8 flat_model_Maisonette uint8 flat_model_Model A uint8 flat_model_Model A2 uint8 flat_model_New Generation uint8 flat_model_Premium Apartment uint8 flat_model_Simplified uint8 flat_model_Standard uint8 dtype: object
data['flat_type'] = data['flat_type'].astype("int64")
data['storey_range'] = data['storey_range'].astype("int64")
import seaborn as sns
fig = plt.figure(figsize=(10,10))
ax = sns.heatmap(data.select_dtypes(include=['int64','float64']).corr(), annot = True, fmt='.2g',
cmap = 'coolwarm', linewidth=0.5, annot_kws={"size": 8})
plt.xticks(rotation=45, ha='right')
fig.show()
C:\Users\raych\AppData\Local\Temp\ipykernel_16440\1764189709.py:7: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure. fig.show()
High correlation (>0.6) between:
Apart from correlation, we also check the VIF scores. Generally, if the largest VIF score is greater than 10 (Bowerman & O'Connell, 1990), and a tolerance below 0.1 (Menard, 1995) indicates a likely issue of the regression being biased.
# Multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
vif = pd.DataFrame()
vif["variables"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
vif['tolerance'] = 1/vif.VIF
vif['meanVIF'] = vif.VIF.mean()
return(vif)
vif_df = calc_vif(data.drop('adjusted_price',axis=1).select_dtypes(include=['int64','float64']))
vif_df
| variables | VIF | tolerance | meanVIF | |
|---|---|---|---|---|
| 0 | flat_type | 73.623977 | 0.013583 | 39.471484 |
| 1 | storey_range | 2.696498 | 0.370851 | 39.471484 |
| 2 | floor_area_sqm | 199.542799 | 0.005011 | 39.471484 |
| 3 | remaining_lease | 53.627204 | 0.018647 | 39.471484 |
| 4 | shortest_dist_mrt | 7.830648 | 0.127703 | 39.471484 |
| 5 | mrt_count_within_1km | 4.872612 | 0.205229 | 39.471484 |
| 6 | shortest_dist_mall | 7.681541 | 0.130182 | 39.471484 |
| 7 | mall_count_within_1km | 4.563306 | 0.219139 | 39.471484 |
| 8 | school_shortest_dist | 3.999156 | 0.250053 | 39.471484 |
| 9 | good_sch_count_within_1km | 17.961900 | 0.055673 | 39.471484 |
| 10 | good_sch_score | 26.422103 | 0.037847 | 39.471484 |
| 11 | good_sch_score_sap_gep | 4.391022 | 0.227737 | 39.471484 |
| 12 | distance_to_cbd_km | 23.328839 | 0.042865 | 39.471484 |
| 13 | matured | 4.198440 | 0.238184 | 39.471484 |
| 14 | year | 157.332215 | 0.006356 | 39.471484 |
between flat_type and floor_area_sqm, we will drop flat_type as floor_area_sqm could have more potential for feature explanation as it is a continuous variable, while flat_type has been label encoded.
For the mrt, mall, sch features, we will choose to drop the features that measure the count within 1km first, for reasons being the shortest distance features being continuous.
Between distance to cbd and matured features, we will drop matured as distance to cbd is a continous variable as well.
Between good_sch_score_sap_gep and good_sch_score, we will just drop good_sch_score_sap_gep for the time being. Depending on the results of the good_sch_score feature results, we will decide if there is reason to run the models again but with good_sch_score_sap_gep instead.
year variable is dropped as well, as we have already accounted for resale price inflation using CPI values. This should account for majority of the impact that year has on resale prices, due to the fact that generally why prices increases over time is due to inflation. It is also dropped for model simplicity.
dropped_features = ['mrt_count_within_1km','mall_count_within_1km',
'matured','good_sch_count_within_1km','flat_type','good_sch_score_sap_gep',
'year']
calc_vif(data.drop(dropped_features,axis=1).
drop('adjusted_price', axis = 1).
select_dtypes(include=['int64','float64']))
| variables | VIF | tolerance | meanVIF | |
|---|---|---|---|---|
| 0 | storey_range | 2.677816 | 0.373439 | 9.206758 |
| 1 | floor_area_sqm | 17.208520 | 0.058111 | 9.206758 |
| 2 | remaining_lease | 27.905570 | 0.035835 | 9.206758 |
| 3 | shortest_dist_mrt | 4.286971 | 0.233265 | 9.206758 |
| 4 | shortest_dist_mall | 3.817710 | 0.261937 | 9.206758 |
| 5 | school_shortest_dist | 3.576506 | 0.279602 | 9.206758 |
| 6 | good_sch_score | 1.556816 | 0.642337 | 9.206758 |
| 7 | distance_to_cbd_km | 12.624152 | 0.079213 | 9.206758 |
We observe that remaining_lease and floor_area_sqm still has a very high VIF score, but we shall still keep these features as we feel they are important explainators of hdb resale price.
def model_performance(model, X_train, X_test, y_train, y_test):
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('R^2 score =', r2)
print('MAE:', mae)
print('RMSE', rmse)
return(r2, mae, rmse)
df_1 = data.drop(columns = dropped_features)
# Split target variable from dataset
y= df_1['adjusted_price']
X = df_1.drop(columns='adjusted_price')
X
| storey_range | floor_area_sqm | remaining_lease | shortest_dist_mrt | shortest_dist_mall | school_shortest_dist | good_sch_score | distance_to_cbd_km | town_BEDOK | town_BISHAN | ... | town_YISHUN | flat_model_DBSS | flat_model_Improved | flat_model_Maisonette | flat_model_Model A | flat_model_Model A2 | flat_model_New Generation | flat_model_Premium Apartment | flat_model_Simplified | flat_model_Standard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7 | 103.0 | 74.333333 | 0.754215 | 0.824707 | 0.214393 | 3 | 7.635584 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 1 | 7 | 102.0 | 74.583333 | 0.754215 | 0.824707 | 0.214393 | 3 | 7.635584 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 2 | 5 | 102.0 | 74.666667 | 0.754215 | 0.824707 | 0.214393 | 3 | 7.635584 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 3 | 2 | 121.0 | 75.083333 | 0.754215 | 0.824707 | 0.214393 | 3 | 7.635584 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 4 | 2 | 102.0 | 75.583333 | 0.754215 | 0.824707 | 0.214393 | 3 | 7.635584 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 217186 | 2 | 56.0 | 59.000000 | 0.515838 | 1.203264 | 0.173506 | 0 | 5.895374 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 217187 | 3 | 56.0 | 55.000000 | 0.369882 | 1.192208 | 0.321240 | 0 | 6.199401 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 217188 | 0 | 69.0 | 66.000000 | 0.749353 | 0.788194 | 0.324541 | 0 | 9.498989 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 217189 | 0 | 142.0 | 81.000000 | 1.259868 | 0.861002 | 0.237382 | 0 | 10.127127 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 217190 | 1 | 67.0 | 70.000000 | 0.240490 | 0.183585 | 0.526720 | 2 | 1.428059 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
215760 rows × 42 columns
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=100)
print(len(X_train), len(X_test))
print(len(y_train), len(y_test))
172608 43152 172608 43152
# Scale dataset
continuous_features = ['floor_area_sqm', 'remaining_lease','shortest_dist_mrt',
'shortest_dist_mall', 'school_shortest_dist', 'distance_to_cbd_km']
scaler = StandardScaler()
X_train_continuous = X_train[continuous_features]
X_train_continuous = scaler.fit_transform(X_train_continuous)
X_test_continuous = X_test[continuous_features]
X_test_continuous = scaler.transform(X_test_continuous)
# Replace original continuous features with scaled ones in X
X_train[continuous_features] = X_train_continuous
X_test[continuous_features] = X_test_continuous
X_train
| storey_range | floor_area_sqm | remaining_lease | shortest_dist_mrt | shortest_dist_mall | school_shortest_dist | good_sch_score | distance_to_cbd_km | town_BEDOK | town_BISHAN | ... | town_YISHUN | flat_model_DBSS | flat_model_Improved | flat_model_Maisonette | flat_model_Model A | flat_model_Model A2 | flat_model_New Generation | flat_model_Premium Apartment | flat_model_Simplified | flat_model_Standard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 15580 | 2 | -0.998557 | -0.911747 | -0.219190 | 0.145591 | 0.417011 | 2 | -0.598740 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 79337 | 2 | 0.453227 | -1.049132 | -1.181970 | 0.474732 | 0.200623 | 0 | 0.318094 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 100839 | 2 | 0.287309 | -0.283703 | 0.397181 | 0.349683 | 1.178963 | 0 | 1.056425 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 40938 | 3 | -0.168966 | -0.643520 | -0.384295 | -1.256916 | -0.553823 | 0 | 0.920318 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 105408 | 2 | -0.210445 | -1.068758 | 2.083922 | -0.323151 | 0.520961 | 1 | -0.560811 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66318 | 0 | -0.915598 | -0.597725 | 0.288099 | 0.549116 | 0.960116 | 1 | -0.764001 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 78499 | 1 | 0.328789 | -0.322956 | -1.062610 | 1.572633 | 0.763628 | 0 | 0.444479 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 212109 | 0 | 1.158380 | 0.409762 | -0.787881 | -0.678571 | -1.211267 | 0 | 0.043268 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 56758 | 2 | 0.287309 | -0.761278 | -0.504996 | 0.147404 | -0.061300 | 3 | -0.470074 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 39004 | 0 | -1.247434 | -2.148209 | -0.174589 | 1.173675 | -0.609230 | 1 | -1.576328 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
172608 rows × 42 columns
X_test
| storey_range | floor_area_sqm | remaining_lease | shortest_dist_mrt | shortest_dist_mall | school_shortest_dist | good_sch_score | distance_to_cbd_km | town_BEDOK | town_BISHAN | ... | town_YISHUN | flat_model_DBSS | flat_model_Improved | flat_model_Maisonette | flat_model_Model A | flat_model_Model A2 | flat_model_New Generation | flat_model_Premium Apartment | flat_model_Simplified | flat_model_Standard | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 171170 | 0 | 0.245830 | -0.538846 | -0.109694 | 0.055119 | -1.027764 | 1 | 0.282983 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 170486 | 3 | -1.205954 | -0.754736 | -1.043354 | 0.588785 | 0.353329 | 0 | -0.419751 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 142664 | 4 | 1.241339 | -0.100524 | 1.076450 | -0.041177 | -0.493227 | 0 | 0.801918 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 166731 | 3 | -0.293404 | 1.018179 | -0.945443 | -0.099957 | -0.632235 | 0 | -0.056253 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 97584 | 2 | -0.625241 | -1.009879 | -0.673045 | -0.401977 | -1.060400 | 0 | 0.985456 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 203232 | 3 | 1.324298 | 0.265835 | -0.468724 | -0.810904 | -1.057700 | 0 | 0.462694 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 12918 | 0 | -0.210445 | 1.508839 | -0.103458 | 0.077979 | -1.112328 | 0 | 0.134484 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 211177 | 1 | -0.168966 | 1.149022 | 0.826356 | -0.614291 | -1.239370 | 0 | 0.494265 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
| 8013 | 2 | -0.874118 | -1.978113 | -1.315470 | 0.417498 | 0.366404 | 0 | -1.380488 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 97117 | 1 | -0.251925 | -0.676231 | 0.336581 | -0.510823 | 2.297689 | 0 | -0.436026 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
43152 rows × 42 columns
We have shortlisted the following models to understand the drivers of HDB resale prices:
Linear regression is shortlisted as it provides a useful way to interpret the respective impacts that each feature has on the hdb resale price. We will be able to infer directionality of the impacts as well.
We want to also compare the performances of ensemble methods, which are non-parametric in nature. Specifically, we shortlisted one bagging (bootstrap aggregating) method, which will be Random Forest, and one boosting method, which will be XGBoost. From this, we will compare the performances between the models and investigate feature importance.
lr = LinearRegression()
lin_reg_r2, lin_reg_mae, lin_reg_rmse = model_performance(lr, X_train, X_test, y_train, y_test)
R^2 score = 0.8470137964884951 MAE: 45945.24173236661 RMSE 58388.14949141931
Let's plot the coefficients and observe their respective impacts on hdb resale price
lr_importance = pd.DataFrame(list(zip(X_train.columns, lr.coef_)), columns =['Features', 'Coeff'])
lr_importance.sort_values(by = 'Coeff', ascending = False)
| Features | Coeff | |
|---|---|---|
| 22 | town_MARINE PARADE | 182876.155867 |
| 13 | town_BUKIT TIMAH | 180867.128541 |
| 1 | floor_area_sqm | 100569.917762 |
| 33 | flat_model_DBSS | 95791.020966 |
| 9 | town_BISHAN | 71224.237114 |
| 2 | remaining_lease | 61827.776129 |
| 25 | town_QUEENSTOWN | 59032.444003 |
| 35 | flat_model_Maisonette | 37567.596310 |
| 16 | town_CLEMENTI | 34168.057556 |
| 11 | town_BUKIT MERAH | 33369.508381 |
| 0 | storey_range | 13919.648850 |
| 30 | town_TOA PAYOH | 13294.281834 |
| 8 | town_BEDOK | 11999.701502 |
| 5 | school_shortest_dist | 5098.324859 |
| 28 | town_SERANGOON | 1868.748304 |
| 29 | town_TAMPINES | 354.860367 |
| 6 | good_sch_score | -3401.334753 |
| 17 | town_GEYLANG | -7511.513373 |
| 4 | shortest_dist_mall | -8810.871184 |
| 40 | flat_model_Simplified | -10272.252212 |
| 19 | town_JURONG EAST | -10637.936291 |
| 41 | flat_model_Standard | -11519.127836 |
| 21 | town_KALLANG/WHAMPOA | -13187.320606 |
| 38 | flat_model_New Generation | -14817.420165 |
| 14 | town_CENTRAL AREA | -18189.962034 |
| 39 | flat_model_Premium Apartment | -20284.436499 |
| 3 | shortest_dist_mrt | -22405.357487 |
| 37 | flat_model_Model A2 | -27750.356810 |
| 32 | town_YISHUN | -28937.318258 |
| 34 | flat_model_Improved | -29252.274147 |
| 23 | town_PASIR RIS | -32199.129649 |
| 36 | flat_model_Model A | -33145.420563 |
| 10 | town_BUKIT BATOK | -33233.740279 |
| 24 | town_PUNGGOL | -33665.494955 |
| 20 | town_JURONG WEST | -37587.960389 |
| 7 | distance_to_cbd_km | -49035.822412 |
| 18 | town_HOUGANG | -54344.617976 |
| 12 | town_BUKIT PANJANG | -54911.337758 |
| 31 | town_WOODLANDS | -63801.355403 |
| 26 | town_SEMBAWANG | -79632.487769 |
| 27 | town_SENGKANG | -86226.465121 |
| 15 | town_CHOA CHU KANG | -89797.790343 |
Let's also plot the OLS regression results, to look at the significance of the features.
X_constant = sm.add_constant(X_train)
lr_ols = sm.OLS(y_train,X_constant).fit()
print(lr_ols.summary())
OLS Regression Results
==============================================================================
Dep. Variable: adjusted_price R-squared: 0.846
Model: OLS Adj. R-squared: 0.846
Method: Least Squares F-statistic: 2.263e+04
Date: Wed, 05 Apr 2023 Prob (F-statistic): 0.00
Time: 11:46:14 Log-Likelihood: -2.1384e+06
No. Observations: 172608 AIC: 4.277e+06
Df Residuals: 172565 BIC: 4.277e+06
Df Model: 42
Covariance Type: nonrobust
================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------
const 4.79e+05 1230.401 389.274 0.000 4.77e+05 4.81e+05
storey_range 1.392e+04 82.346 169.039 0.000 1.38e+04 1.41e+04
floor_area_sqm 1.006e+05 202.379 496.939 0.000 1e+05 1.01e+05
remaining_lease 6.183e+04 220.299 280.654 0.000 6.14e+04 6.23e+04
shortest_dist_mrt -2.241e+04 171.441 -130.688 0.000 -2.27e+04 -2.21e+04
shortest_dist_mall -8810.8712 172.376 -51.114 0.000 -9148.725 -8473.018
school_shortest_dist 5098.3249 157.332 32.405 0.000 4789.958 5406.691
good_sch_score -3401.3348 209.749 -16.216 0.000 -3812.437 -2990.232
distance_to_cbd_km -4.904e+04 767.092 -63.924 0.000 -5.05e+04 -4.75e+04
town_BEDOK 1.2e+04 920.130 13.041 0.000 1.02e+04 1.38e+04
town_BISHAN 7.122e+04 1378.627 51.663 0.000 6.85e+04 7.39e+04
town_BUKIT BATOK -3.323e+04 1265.046 -26.271 0.000 -3.57e+04 -3.08e+04
town_BUKIT MERAH 3.337e+04 1445.113 23.091 0.000 3.05e+04 3.62e+04
town_BUKIT PANJANG -5.491e+04 1395.648 -39.345 0.000 -5.76e+04 -5.22e+04
town_BUKIT TIMAH 1.809e+05 2997.299 60.343 0.000 1.75e+05 1.87e+05
town_CENTRAL AREA -1.819e+04 2408.025 -7.554 0.000 -2.29e+04 -1.35e+04
town_CHOA CHU KANG -8.98e+04 1605.270 -55.939 0.000 -9.29e+04 -8.67e+04
town_CLEMENTI 3.417e+04 1155.843 29.561 0.000 3.19e+04 3.64e+04
town_GEYLANG -7511.5134 1346.014 -5.581 0.000 -1.01e+04 -4873.357
town_HOUGANG -5.434e+04 1023.000 -53.123 0.000 -5.63e+04 -5.23e+04
town_JURONG EAST -1.064e+04 1422.790 -7.477 0.000 -1.34e+04 -7849.299
town_JURONG WEST -3.759e+04 1693.108 -22.201 0.000 -4.09e+04 -3.43e+04
town_KALLANG/WHAMPOA -1.319e+04 1466.116 -8.995 0.000 -1.61e+04 -1.03e+04
town_MARINE PARADE 1.829e+05 1992.567 91.779 0.000 1.79e+05 1.87e+05
town_PASIR RIS -3.22e+04 1459.241 -22.066 0.000 -3.51e+04 -2.93e+04
town_PUNGGOL -3.367e+04 1352.125 -24.898 0.000 -3.63e+04 -3.1e+04
town_QUEENSTOWN 5.903e+04 1265.368 46.652 0.000 5.66e+04 6.15e+04
town_SEMBAWANG -7.963e+04 2014.688 -39.526 0.000 -8.36e+04 -7.57e+04
town_SENGKANG -8.623e+04 1107.736 -77.840 0.000 -8.84e+04 -8.41e+04
town_SERANGOON 1868.7483 1251.971 1.493 0.136 -585.086 4322.583
town_TAMPINES 354.8604 1129.776 0.314 0.753 -1859.475 2569.196
town_TOA PAYOH 1.329e+04 1314.233 10.116 0.000 1.07e+04 1.59e+04
town_WOODLANDS -6.38e+04 1813.139 -35.188 0.000 -6.74e+04 -6.02e+04
town_YISHUN -2.894e+04 1438.108 -20.122 0.000 -3.18e+04 -2.61e+04
flat_model_DBSS 9.579e+04 1515.217 63.219 0.000 9.28e+04 9.88e+04
flat_model_Improved -2.925e+04 834.748 -35.043 0.000 -3.09e+04 -2.76e+04
flat_model_Maisonette 3.757e+04 1083.238 34.681 0.000 3.54e+04 3.97e+04
flat_model_Model A -3.315e+04 864.679 -38.333 0.000 -3.48e+04 -3.15e+04
flat_model_Model A2 -2.775e+04 1510.256 -18.375 0.000 -3.07e+04 -2.48e+04
flat_model_New Generation -1.482e+04 1000.321 -14.813 0.000 -1.68e+04 -1.29e+04
flat_model_Premium Apartment -2.028e+04 929.027 -21.834 0.000 -2.21e+04 -1.85e+04
flat_model_Simplified -1.027e+04 1135.871 -9.044 0.000 -1.25e+04 -8045.970
flat_model_Standard -1.152e+04 1229.802 -9.367 0.000 -1.39e+04 -9108.744
==============================================================================
Omnibus: 9969.465 Durbin-Watson: 1.997
Prob(Omnibus): 0.000 Jarque-Bera (JB): 13413.395
Skew: 0.540 Prob(JB): 0.00
Kurtosis: 3.836 Cond. No. 106.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
lr_importance['Coeff_ABS'] = np.abs(lr_importance['Coeff'])
lr_importance = lr_importance.sort_values('Coeff_ABS', ascending=False).reset_index(drop= True)
lr_importance['color'] = lr_importance['Coeff'].apply(lambda x: '#66ff8c' if x>0 else '#ff8c66')
lr_importance
| Features | Coeff | Coeff_ABS | color | |
|---|---|---|---|---|
| 0 | town_MARINE PARADE | 182876.155867 | 182876.155867 | #66ff8c |
| 1 | town_BUKIT TIMAH | 180867.128541 | 180867.128541 | #66ff8c |
| 2 | floor_area_sqm | 100569.917762 | 100569.917762 | #66ff8c |
| 3 | flat_model_DBSS | 95791.020966 | 95791.020966 | #66ff8c |
| 4 | town_CHOA CHU KANG | -89797.790343 | 89797.790343 | #ff8c66 |
| 5 | town_SENGKANG | -86226.465121 | 86226.465121 | #ff8c66 |
| 6 | town_SEMBAWANG | -79632.487769 | 79632.487769 | #ff8c66 |
| 7 | town_BISHAN | 71224.237114 | 71224.237114 | #66ff8c |
| 8 | town_WOODLANDS | -63801.355403 | 63801.355403 | #ff8c66 |
| 9 | remaining_lease | 61827.776129 | 61827.776129 | #66ff8c |
| 10 | town_QUEENSTOWN | 59032.444003 | 59032.444003 | #66ff8c |
| 11 | town_BUKIT PANJANG | -54911.337758 | 54911.337758 | #ff8c66 |
| 12 | town_HOUGANG | -54344.617976 | 54344.617976 | #ff8c66 |
| 13 | distance_to_cbd_km | -49035.822412 | 49035.822412 | #ff8c66 |
| 14 | town_JURONG WEST | -37587.960389 | 37587.960389 | #ff8c66 |
| 15 | flat_model_Maisonette | 37567.596310 | 37567.596310 | #66ff8c |
| 16 | town_CLEMENTI | 34168.057556 | 34168.057556 | #66ff8c |
| 17 | town_PUNGGOL | -33665.494955 | 33665.494955 | #ff8c66 |
| 18 | town_BUKIT MERAH | 33369.508381 | 33369.508381 | #66ff8c |
| 19 | town_BUKIT BATOK | -33233.740279 | 33233.740279 | #ff8c66 |
| 20 | flat_model_Model A | -33145.420563 | 33145.420563 | #ff8c66 |
| 21 | town_PASIR RIS | -32199.129649 | 32199.129649 | #ff8c66 |
| 22 | flat_model_Improved | -29252.274147 | 29252.274147 | #ff8c66 |
| 23 | town_YISHUN | -28937.318258 | 28937.318258 | #ff8c66 |
| 24 | flat_model_Model A2 | -27750.356810 | 27750.356810 | #ff8c66 |
| 25 | shortest_dist_mrt | -22405.357487 | 22405.357487 | #ff8c66 |
| 26 | flat_model_Premium Apartment | -20284.436499 | 20284.436499 | #ff8c66 |
| 27 | town_CENTRAL AREA | -18189.962034 | 18189.962034 | #ff8c66 |
| 28 | flat_model_New Generation | -14817.420165 | 14817.420165 | #ff8c66 |
| 29 | storey_range | 13919.648850 | 13919.648850 | #66ff8c |
| 30 | town_TOA PAYOH | 13294.281834 | 13294.281834 | #66ff8c |
| 31 | town_KALLANG/WHAMPOA | -13187.320606 | 13187.320606 | #ff8c66 |
| 32 | town_BEDOK | 11999.701502 | 11999.701502 | #66ff8c |
| 33 | flat_model_Standard | -11519.127836 | 11519.127836 | #ff8c66 |
| 34 | town_JURONG EAST | -10637.936291 | 10637.936291 | #ff8c66 |
| 35 | flat_model_Simplified | -10272.252212 | 10272.252212 | #ff8c66 |
| 36 | shortest_dist_mall | -8810.871184 | 8810.871184 | #ff8c66 |
| 37 | town_GEYLANG | -7511.513373 | 7511.513373 | #ff8c66 |
| 38 | school_shortest_dist | 5098.324859 | 5098.324859 | #66ff8c |
| 39 | good_sch_score | -3401.334753 | 3401.334753 | #ff8c66 |
| 40 | town_SERANGOON | 1868.748304 | 1868.748304 | #66ff8c |
| 41 | town_TAMPINES | 354.860367 | 354.860367 | #66ff8c |
fig, ax = plt.subplots(figsize=(8,8))
lr_importance_graph = lr_importance.head(10)
graph = ax.barh(lr_importance_graph['Features'], lr_importance_graph['Coeff_ABS'],
color=lr_importance_graph['color'])
colors = {'Positive':'#66ff8c', 'Negative':'#ff8c66'}
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
plt.legend(handles, labels, title='Relationship',loc='lower right', fontsize ='10')
plt.title('Linear Regression Coefficients (Top 10)- 2013 to 2023', size = 15)
plt.ylabel('Features', size = 12)
plt.xlabel('Coefficient', size = 12)
ax.bar_label(graph, fmt='%.0f', padding =10)
ax.set_xlim(right=220000)
ax.invert_yaxis()
plt.show()
fig, ax = plt.subplots(figsize=(10,10))
graph = ax.barh(lr_importance['Features'], lr_importance['Coeff_ABS'],
color=lr_importance['color'])
colors = {'Positive':'#66ff8c', 'Negative':'#ff8c66'}
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
plt.legend(handles, labels, title='Relationship',loc='lower right', fontsize ='10')
plt.title('Linear Regression Coefficients- 2013 to 2023', size = 15)
plt.ylabel('Features', size = 12)
plt.xlabel('Coefficient', size = 12)
ax.bar_label(graph, fmt='%.0f', padding =10)
ax.set_xlim(right=200000)
ax.invert_yaxis()
plt.show()
Computing the default parameter model
rf = RandomForestRegressor(random_state = 100)
rf_r2, rf_mae, rf_rmse = model_performance(rf, X_train, X_test, y_train, y_test)
R^2 score = 0.9648398678215069 MAE: 20192.897304058253 RMSE 27991.348871304246
Training the default random forest model takes about 6-7 minutes.
rf_importance = pd.DataFrame(list(zip(rf.feature_names_in_, rf.feature_importances_)), columns =['Features', 'Coeff'])
rf_importance = rf_importance.sort_values(by = 'Coeff', ascending = False).reset_index(drop =True)
rf_importance
| Features | Coeff | |
|---|---|---|
| 0 | floor_area_sqm | 0.496788 |
| 1 | distance_to_cbd_km | 0.239694 |
| 2 | remaining_lease | 0.117988 |
| 3 | storey_range | 0.040960 |
| 4 | shortest_dist_mrt | 0.026672 |
| 5 | shortest_dist_mall | 0.020307 |
| 6 | school_shortest_dist | 0.009732 |
| 7 | flat_model_DBSS | 0.008401 |
| 8 | good_sch_score | 0.004227 |
| 9 | town_BUKIT TIMAH | 0.003881 |
| 10 | flat_model_Model A | 0.003815 |
| 11 | flat_model_Improved | 0.002632 |
| 12 | town_TAMPINES | 0.002521 |
| 13 | town_HOUGANG | 0.002267 |
| 14 | town_CLEMENTI | 0.001995 |
| 15 | town_BISHAN | 0.001864 |
| 16 | town_JURONG WEST | 0.001468 |
| 17 | town_BUKIT PANJANG | 0.001261 |
| 18 | flat_model_Premium Apartment | 0.001193 |
| 19 | town_PUNGGOL | 0.001174 |
| 20 | town_SENGKANG | 0.001156 |
| 21 | town_BEDOK | 0.000914 |
| 22 | town_PASIR RIS | 0.000871 |
| 23 | town_JURONG EAST | 0.000833 |
| 24 | town_BUKIT BATOK | 0.000828 |
| 25 | town_BUKIT MERAH | 0.000823 |
| 26 | town_GEYLANG | 0.000810 |
| 27 | town_QUEENSTOWN | 0.000807 |
| 28 | town_MARINE PARADE | 0.000720 |
| 29 | town_KALLANG/WHAMPOA | 0.000506 |
| 30 | town_CHOA CHU KANG | 0.000459 |
| 31 | flat_model_New Generation | 0.000366 |
| 32 | town_YISHUN | 0.000316 |
| 33 | flat_model_Maisonette | 0.000307 |
| 34 | town_SERANGOON | 0.000272 |
| 35 | town_TOA PAYOH | 0.000251 |
| 36 | town_WOODLANDS | 0.000227 |
| 37 | flat_model_Simplified | 0.000202 |
| 38 | town_CENTRAL AREA | 0.000193 |
| 39 | flat_model_Standard | 0.000153 |
| 40 | town_SEMBAWANG | 0.000100 |
| 41 | flat_model_Model A2 | 0.000048 |
fig, ax = plt.subplots(figsize=(6,6))
rf_importance_graph = rf_importance.head(10)
graph = ax.barh(rf_importance_graph['Features'], rf_importance_graph['Coeff'])
plt.title('Random Forest Feature Importance (Top 10) - 2013 to 2023', size = 13)
plt.annotate(('Test R\u00b2: ' + str(round(rf_r2,3)) + '\nTest MAE: ' + str(round(rf_mae))+
'\nTest RMSE: ' + str(round(rf_rmse))),
xy=(0.4,8), fontsize = 10)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.bar_label(graph, fmt='%.5f', padding =10)
ax.set_xlim(right=0.65)
ax.invert_yaxis()
plt.show()
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
}
print(random_grid)
{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf1 = RandomForestRegressor(random_state = 100)
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf1, param_distributions = random_grid, n_iter = 2, cv = 3, verbose=2, random_state=100)
# Fit the random search model
rf_random.fit(X_train, y_train)
print(rf_random.best_params_)
Fitting 3 folds for each of 2 candidates, totalling 6 fits [CV] END max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time=10.4min [CV] END max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time= 5.7min [CV] END max_depth=90, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1000; total time= 9.1min
c:\Users\raych\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\ensemble\_forest.py:416: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=110, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=79.6min
c:\Users\raych\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\ensemble\_forest.py:416: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=110, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=64.8min
c:\Users\raych\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\ensemble\_forest.py:416: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
[CV] END max_depth=110, max_features=auto, min_samples_leaf=2, min_samples_split=10, n_estimators=2000; total time=105.8min
c:\Users\raych\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\ensemble\_forest.py:416: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features=1.0` or remove this parameter as it is also the default value for RandomForestRegressors and ExtraTreesRegressors. warn(
{'n_estimators': 2000, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 110}
y_pred = rf_random.predict(X_test)
rf_random_r2 = r2_score(y_test, y_pred)
rf_random_mae = mean_absolute_error(y_test, y_pred)
rf_random_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('R^2 score =', rf_random_r2)
print('MAE:', rf_random_mae)
print('RMSE', rf_random_rmse)
R^2 score = 0.9633581394113007 MAE: 20651.524106641013 RMSE 28575.072214351734
rf_random_importance = pd.DataFrame(list(zip(rf_random.feature_names_in_, rf_random.best_estimator_.feature_importances_)),
columns =['Features', 'Coeff'])
rf_random_importance = rf_random_importance.sort_values(by = 'Coeff', ascending = False).reset_index(drop =True)
rf_random_importance
| Features | Coeff | |
|---|---|---|
| 0 | floor_area_sqm | 0.498620 |
| 1 | distance_to_cbd_km | 0.246664 |
| 2 | remaining_lease | 0.111466 |
| 3 | storey_range | 0.040884 |
| 4 | shortest_dist_mrt | 0.025590 |
| 5 | shortest_dist_mall | 0.017852 |
| 6 | flat_model_DBSS | 0.008522 |
| 7 | school_shortest_dist | 0.008298 |
| 8 | flat_model_Model A | 0.005222 |
| 9 | town_BUKIT TIMAH | 0.004185 |
| 10 | good_sch_score | 0.003942 |
| 11 | flat_model_Improved | 0.003249 |
| 12 | town_TAMPINES | 0.002798 |
| 13 | town_BISHAN | 0.002433 |
| 14 | town_HOUGANG | 0.002238 |
| 15 | town_CLEMENTI | 0.001877 |
| 16 | town_JURONG WEST | 0.001713 |
| 17 | town_SENGKANG | 0.001330 |
| 18 | town_PUNGGOL | 0.001210 |
| 19 | town_BUKIT PANJANG | 0.001195 |
| 20 | flat_model_Premium Apartment | 0.001036 |
| 21 | town_PASIR RIS | 0.000917 |
| 22 | town_BEDOK | 0.000888 |
| 23 | town_BUKIT BATOK | 0.000829 |
| 24 | town_MARINE PARADE | 0.000827 |
| 25 | town_BUKIT MERAH | 0.000800 |
| 26 | town_JURONG EAST | 0.000756 |
| 27 | town_QUEENSTOWN | 0.000726 |
| 28 | town_GEYLANG | 0.000691 |
| 29 | town_KALLANG/WHAMPOA | 0.000437 |
| 30 | town_CHOA CHU KANG | 0.000399 |
| 31 | flat_model_New Generation | 0.000327 |
| 32 | town_YISHUN | 0.000311 |
| 33 | town_TOA PAYOH | 0.000247 |
| 34 | flat_model_Maisonette | 0.000247 |
| 35 | flat_model_Standard | 0.000243 |
| 36 | flat_model_Simplified | 0.000227 |
| 37 | town_CENTRAL AREA | 0.000215 |
| 38 | town_SERANGOON | 0.000210 |
| 39 | town_WOODLANDS | 0.000204 |
| 40 | town_SEMBAWANG | 0.000088 |
| 41 | flat_model_Model A2 | 0.000087 |
Plotting feature importance for Random Forest with Cross Validation
fig, ax = plt.subplots(figsize=(10,10))
rf_random_importance_graph = rf_random_importance.head(20)
graph = ax.barh(rf_random_importance_graph['Features'], rf_random_importance_graph['Coeff'])
plt.title('Random Forest Feature Importance w CV (Top 20) - 2013 to 2023', size = 13)
plt.annotate(('Test R\u00b2: ' + str(round(rf_random_r2,3)) + '\nTest MAE: ' + str(round(rf_random_mae))+
'\nTest RMSE: ' + str(round(rf_random_rmse))),
xy=(0.4,18), fontsize = 13)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.bar_label(graph, fmt='%.5f', padding =10)
ax.set_xlim(right=0.65)
ax.invert_yaxis()
plt.show()
oob_model = RandomForestRegressor(oob_score=True, random_state = 100)
oob_r2, oob_mae, oob_rmse = model_performance(oob_model, X_train, X_test, y_train, y_test)
R^2 score = 0.9648398678215069 MAE: 20192.897304058253 RMSE 27991.348871304246
oob_model.oob_score_
0.9636860036453673
oob_importance = pd.DataFrame(list(zip(oob_model.feature_names_in_, oob_model.feature_importances_)), columns =['Features', 'Coeff'])
oob_importance = oob_importance.sort_values(by = 'Coeff', ascending = False).reset_index(drop =True)
oob_importance
| Features | Coeff | |
|---|---|---|
| 0 | floor_area_sqm | 0.496788 |
| 1 | distance_to_cbd_km | 0.239694 |
| 2 | remaining_lease | 0.117988 |
| 3 | storey_range | 0.040960 |
| 4 | shortest_dist_mrt | 0.026672 |
| 5 | shortest_dist_mall | 0.020307 |
| 6 | school_shortest_dist | 0.009732 |
| 7 | flat_model_DBSS | 0.008401 |
| 8 | good_sch_score | 0.004227 |
| 9 | town_BUKIT TIMAH | 0.003881 |
| 10 | flat_model_Model A | 0.003815 |
| 11 | flat_model_Improved | 0.002632 |
| 12 | town_TAMPINES | 0.002521 |
| 13 | town_HOUGANG | 0.002267 |
| 14 | town_CLEMENTI | 0.001995 |
| 15 | town_BISHAN | 0.001864 |
| 16 | town_JURONG WEST | 0.001468 |
| 17 | town_BUKIT PANJANG | 0.001261 |
| 18 | flat_model_Premium Apartment | 0.001193 |
| 19 | town_PUNGGOL | 0.001174 |
| 20 | town_SENGKANG | 0.001156 |
| 21 | town_BEDOK | 0.000914 |
| 22 | town_PASIR RIS | 0.000871 |
| 23 | town_JURONG EAST | 0.000833 |
| 24 | town_BUKIT BATOK | 0.000828 |
| 25 | town_BUKIT MERAH | 0.000823 |
| 26 | town_GEYLANG | 0.000810 |
| 27 | town_QUEENSTOWN | 0.000807 |
| 28 | town_MARINE PARADE | 0.000720 |
| 29 | town_KALLANG/WHAMPOA | 0.000506 |
| 30 | town_CHOA CHU KANG | 0.000459 |
| 31 | flat_model_New Generation | 0.000366 |
| 32 | town_YISHUN | 0.000316 |
| 33 | flat_model_Maisonette | 0.000307 |
| 34 | town_SERANGOON | 0.000272 |
| 35 | town_TOA PAYOH | 0.000251 |
| 36 | town_WOODLANDS | 0.000227 |
| 37 | flat_model_Simplified | 0.000202 |
| 38 | town_CENTRAL AREA | 0.000193 |
| 39 | flat_model_Standard | 0.000153 |
| 40 | town_SEMBAWANG | 0.000100 |
| 41 | flat_model_Model A2 | 0.000048 |
Out of Bag Validation Feature Importance
fig, ax = plt.subplots(figsize=(10,10))
oob_importance_graph = oob_importance.head(20)
graph = ax.barh(oob_importance_graph['Features'], oob_importance_graph['Coeff'])
plt.title('Out of Bag Feature Importance (Top 20) - 2013 to 2023', size = 13)
plt.annotate(('Test R\u00b2: ' + str(round(oob_r2,3)) + '\nTest MAE: ' + str(round(oob_mae))+
'\nTest RMSE: ' + str(round(oob_rmse))),
xy=(0.4,18), fontsize = 13)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.bar_label(graph, fmt='%.5f', padding =10)
ax.set_xlim(right=0.65)
ax.invert_yaxis()
plt.show()
xgb = XGBRegressor(random_state = 100)
xgb_r2, xgb_mae, xgb_rmse = model_performance(xgb, X_train, X_test, y_train, y_test)
R^2 score = 0.9474017089857787 MAE: 26021.296829411407 RMSE 34236.10852667363
xgb_importance = pd.DataFrame(list(zip(X_train.columns, xgb.feature_importances_)), columns =['Features', 'Coeff'])
xgb_importance = xgb_importance.sort_values(by = 'Coeff', ascending = False).reset_index(drop =True)
xgb_importance
| Features | Coeff | |
|---|---|---|
| 0 | floor_area_sqm | 0.152956 |
| 1 | flat_model_New Generation | 0.124490 |
| 2 | distance_to_cbd_km | 0.094415 |
| 3 | flat_model_DBSS | 0.077454 |
| 4 | town_BUKIT TIMAH | 0.062785 |
| 5 | town_BISHAN | 0.040337 |
| 6 | town_HOUGANG | 0.039884 |
| 7 | town_CLEMENTI | 0.034776 |
| 8 | town_TAMPINES | 0.032404 |
| 9 | town_MARINE PARADE | 0.029710 |
| 10 | storey_range | 0.029125 |
| 11 | flat_model_Model A | 0.021334 |
| 12 | town_PUNGGOL | 0.018742 |
| 13 | town_BUKIT MERAH | 0.018397 |
| 14 | remaining_lease | 0.018202 |
| 15 | town_JURONG EAST | 0.015699 |
| 16 | town_SENGKANG | 0.013202 |
| 17 | shortest_dist_mrt | 0.013138 |
| 18 | town_JURONG WEST | 0.012497 |
| 19 | town_BUKIT BATOK | 0.011652 |
| 20 | town_BUKIT PANJANG | 0.011232 |
| 21 | town_QUEENSTOWN | 0.011013 |
| 22 | town_SERANGOON | 0.010040 |
| 23 | town_GEYLANG | 0.009094 |
| 24 | town_CHOA CHU KANG | 0.008909 |
| 25 | flat_model_Model A2 | 0.008478 |
| 26 | town_BEDOK | 0.007518 |
| 27 | shortest_dist_mall | 0.007339 |
| 28 | town_PASIR RIS | 0.007315 |
| 29 | flat_model_Premium Apartment | 0.006772 |
| 30 | town_YISHUN | 0.006349 |
| 31 | town_TOA PAYOH | 0.006226 |
| 32 | town_CENTRAL AREA | 0.005686 |
| 33 | good_sch_score | 0.005535 |
| 34 | flat_model_Improved | 0.005306 |
| 35 | town_SEMBAWANG | 0.004508 |
| 36 | town_KALLANG/WHAMPOA | 0.003907 |
| 37 | flat_model_Standard | 0.003886 |
| 38 | flat_model_Simplified | 0.003183 |
| 39 | school_shortest_dist | 0.002439 |
| 40 | town_WOODLANDS | 0.002157 |
| 41 | flat_model_Maisonette | 0.001907 |
XGBoost Feature Importance
fig, ax = plt.subplots(figsize=(10,10))
xgb_importance_graph = xgb_importance.head(20)
graph = ax.barh(xgb_importance_graph['Features'], xgb_importance_graph['Coeff'])
plt.title('XGBoost Feature Importance (Top 20) - 2013 to 2023', size = 13)
plt.annotate(('Test R\u00b2: ' + str(round(xgb_r2,3)) + '\nTest MAE: ' + str(round(xgb_mae))+
'\nTest RMSE: ' + str(round(xgb_rmse))),
xy=(0.135,18), fontsize = 13)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.bar_label(graph, fmt='%.5f', padding =10)
ax.set_xlim(right=0.22)
ax.invert_yaxis()
plt.show()
dropped_features_2 = ['mrt_count_within_1km','mall_count_within_1km',
'matured','good_sch_count_within_1km','flat_type','good_sch_score',
'year']
df_2 = data.drop(columns = dropped_features_2)
# Split target variable from dataset
y_2= df_2['adjusted_price']
X_2 = df_2.drop(columns='adjusted_price')
X2_train,X2_test,y2_train,y2_test=train_test_split(X_2,y_2,test_size=0.2,random_state=100)
# Scale dataset
continuous_features = ['floor_area_sqm', 'remaining_lease','shortest_dist_mrt',
'shortest_dist_mall', 'school_shortest_dist', 'distance_to_cbd_km']
scaler = StandardScaler()
X2_train_continuous = X2_train[continuous_features]
X2_train_continuous = scaler.fit_transform(X2_train_continuous)
X2_test_continuous = X2_test[continuous_features]
X2_test_continuous = scaler.transform(X2_test_continuous)
# Replace original continuous features with scaled ones in X
X2_train[continuous_features] = X2_train_continuous
X2_test[continuous_features] = X2_test_continuous
Running Linear Regression to check coefficient of good_sch_score_SAP_GEP
lr2 = LinearRegression()
lin_reg_2_r2, lin_reg_2_mae, lin_reg_2_rmse = model_performance(lr2, X2_train, X2_test, y2_train, y2_test)
R^2 score = 0.8468257881373729 MAE: 46001.168146369855 RMSE 58424.015762086485
lr_2_importance = pd.DataFrame(list(zip(lr2.feature_names_in_, lr2.coef_)), columns =['Features', 'Coeff'])
lr_2_importance = lr_2_importance.sort_values(by = 'Coeff', ascending = False).reset_index(drop =True)
lr_2_importance
| Features | Coeff | |
|---|---|---|
| 0 | town_MARINE PARADE | 180559.685439 |
| 1 | town_BUKIT TIMAH | 172260.109590 |
| 2 | floor_area_sqm | 101163.786164 |
| 3 | flat_model_DBSS | 96392.406467 |
| 4 | town_BISHAN | 66222.003068 |
| 5 | remaining_lease | 62379.280902 |
| 6 | town_QUEENSTOWN | 57554.300906 |
| 7 | flat_model_Maisonette | 37787.053763 |
| 8 | town_BUKIT MERAH | 34885.110961 |
| 9 | town_CLEMENTI | 29971.824165 |
| 10 | storey_range | 14074.526464 |
| 11 | town_BEDOK | 10422.200086 |
| 12 | town_TOA PAYOH | 9374.820768 |
| 13 | school_shortest_dist | 5578.299926 |
| 14 | good_sch_score_sap_gep | 1165.568196 |
| 15 | town_SERANGOON | 235.824063 |
| 16 | town_TAMPINES | -3249.194358 |
| 17 | shortest_dist_mall | -9734.985256 |
| 18 | flat_model_Simplified | -10529.464756 |
| 19 | town_JURONG EAST | -10684.563223 |
| 20 | town_GEYLANG | -11546.867920 |
| 21 | flat_model_Standard | -14021.056507 |
| 22 | town_KALLANG/WHAMPOA | -14446.040576 |
| 23 | flat_model_New Generation | -18469.235664 |
| 24 | town_CENTRAL AREA | -20317.722650 |
| 25 | flat_model_Premium Apartment | -20651.830504 |
| 26 | shortest_dist_mrt | -23269.491895 |
| 27 | flat_model_Model A2 | -29110.214431 |
| 28 | town_YISHUN | -30341.511250 |
| 29 | flat_model_Improved | -30572.832746 |
| 30 | town_PASIR RIS | -31946.388680 |
| 31 | flat_model_Model A | -33277.231920 |
| 32 | town_PUNGGOL | -33358.040305 |
| 33 | town_BUKIT BATOK | -34476.244802 |
| 34 | town_JURONG WEST | -39008.384206 |
| 35 | distance_to_cbd_km | -48683.240896 |
| 36 | town_BUKIT PANJANG | -58715.832864 |
| 37 | town_HOUGANG | -61281.877000 |
| 38 | town_WOODLANDS | -65884.191569 |
| 39 | town_SEMBAWANG | -81955.302185 |
| 40 | town_SENGKANG | -87261.418942 |
| 41 | town_CHOA CHU KANG | -92325.547371 |
X2_constant = sm.add_constant(X2_train)
lr_ols = sm.OLS(y2_train,X2_constant).fit()
print(lr_ols.summary())
OLS Regression Results
==============================================================================
Dep. Variable: adjusted_price R-squared: 0.849
Model: OLS Adj. R-squared: 0.849
Method: Least Squares F-statistic: 2.124e+04
Date: Tue, 04 Apr 2023 Prob (F-statistic): 0.00
Time: 20:33:27 Log-Likelihood: -1.9693e+06
No. Observations: 158905 AIC: 3.939e+06
Df Residuals: 158862 BIC: 3.939e+06
Df Model: 42
Covariance Type: nonrobust
================================================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------------------------
const 4.778e+05 1295.258 368.871 0.000 4.75e+05 4.8e+05
storey_range 1.407e+04 85.876 163.893 0.000 1.39e+04 1.42e+04
floor_area_sqm 1.012e+05 210.776 479.959 0.000 1.01e+05 1.02e+05
remaining_lease 6.238e+04 232.178 268.670 0.000 6.19e+04 6.28e+04
shortest_dist_mrt -2.327e+04 180.403 -128.986 0.000 -2.36e+04 -2.29e+04
shortest_dist_mall -9734.9853 180.810 -53.841 0.000 -1.01e+04 -9380.601
school_shortest_dist 5578.2999 165.630 33.679 0.000 5253.669 5902.931
good_sch_score_sap_gep 1165.5682 372.796 3.127 0.002 434.896 1896.240
distance_to_cbd_km -4.868e+04 800.739 -60.798 0.000 -5.03e+04 -4.71e+04
town_BEDOK 1.042e+04 964.403 10.807 0.000 8531.991 1.23e+04
town_BISHAN 6.622e+04 1439.835 45.993 0.000 6.34e+04 6.9e+04
town_BUKIT BATOK -3.448e+04 1330.692 -25.909 0.000 -3.71e+04 -3.19e+04
town_BUKIT MERAH 3.489e+04 1521.754 22.924 0.000 3.19e+04 3.79e+04
town_BUKIT PANJANG -5.872e+04 1462.859 -40.138 0.000 -6.16e+04 -5.58e+04
town_BUKIT TIMAH 1.723e+05 3069.938 56.112 0.000 1.66e+05 1.78e+05
town_CENTRAL AREA -2.032e+04 2527.608 -8.038 0.000 -2.53e+04 -1.54e+04
town_CHOA CHU KANG -9.233e+04 1684.087 -54.822 0.000 -9.56e+04 -8.9e+04
town_CLEMENTI 2.997e+04 1244.070 24.092 0.000 2.75e+04 3.24e+04
town_GEYLANG -1.155e+04 1381.914 -8.356 0.000 -1.43e+04 -8838.345
town_HOUGANG -6.128e+04 1030.984 -59.440 0.000 -6.33e+04 -5.93e+04
town_JURONG EAST -1.068e+04 1502.543 -7.111 0.000 -1.36e+04 -7739.611
town_JURONG WEST -3.901e+04 1776.893 -21.953 0.000 -4.25e+04 -3.55e+04
town_KALLANG/WHAMPOA -1.445e+04 1545.822 -9.345 0.000 -1.75e+04 -1.14e+04
town_MARINE PARADE 1.806e+05 2083.430 86.665 0.000 1.76e+05 1.85e+05
town_PASIR RIS -3.195e+04 1539.185 -20.755 0.000 -3.5e+04 -2.89e+04
town_PUNGGOL -3.336e+04 1417.832 -23.527 0.000 -3.61e+04 -3.06e+04
town_QUEENSTOWN 5.755e+04 1326.919 43.374 0.000 5.5e+04 6.02e+04
town_SEMBAWANG -8.196e+04 2111.271 -38.818 0.000 -8.61e+04 -7.78e+04
town_SENGKANG -8.726e+04 1165.971 -74.840 0.000 -8.95e+04 -8.5e+04
town_SERANGOON 235.8241 1315.948 0.179 0.858 -2343.406 2815.054
town_TAMPINES -3249.1944 1189.428 -2.732 0.006 -5580.448 -917.940
town_TOA PAYOH 9374.8208 1378.601 6.800 0.000 6672.791 1.21e+04
town_WOODLANDS -6.588e+04 1905.801 -34.570 0.000 -6.96e+04 -6.21e+04
town_YISHUN -3.034e+04 1510.103 -20.092 0.000 -3.33e+04 -2.74e+04
flat_model_DBSS 9.639e+04 1556.799 61.917 0.000 9.33e+04 9.94e+04
flat_model_Improved -3.057e+04 874.308 -34.968 0.000 -3.23e+04 -2.89e+04
flat_model_Maisonette 3.779e+04 1135.881 33.267 0.000 3.56e+04 4e+04
flat_model_Model A -3.328e+04 907.136 -36.684 0.000 -3.51e+04 -3.15e+04
flat_model_Model A2 -2.911e+04 1586.464 -18.349 0.000 -3.22e+04 -2.6e+04
flat_model_New Generation -1.847e+04 1050.678 -17.578 0.000 -2.05e+04 -1.64e+04
flat_model_Premium Apartment -2.065e+04 973.035 -21.224 0.000 -2.26e+04 -1.87e+04
flat_model_Simplified -1.053e+04 1195.637 -8.807 0.000 -1.29e+04 -8186.041
flat_model_Standard -1.402e+04 1297.473 -10.806 0.000 -1.66e+04 -1.15e+04
==============================================================================
Omnibus: 9903.857 Durbin-Watson: 2.003
Prob(Omnibus): 0.000 Jarque-Bera (JB): 13479.542
Skew: 0.566 Prob(JB): 0.00
Kurtosis: 3.869 Cond. No. 106.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
lr_2_importance['Coeff_ABS'] = np.abs(lr_2_importance['Coeff'])
lr_2_importance = lr_2_importance.sort_values('Coeff_ABS', ascending = False).reset_index(drop= True)
lr_2_importance['color'] = lr_2_importance['Coeff'].apply(lambda x: '#66ff8c' if x>0 else '#ff8c66')
lr_2_importance
| Features | Coeff | Coeff_ABS | color | |
|---|---|---|---|---|
| 0 | town_MARINE PARADE | 180559.685439 | 180559.685439 | #66ff8c |
| 1 | town_BUKIT TIMAH | 172260.109590 | 172260.109590 | #66ff8c |
| 2 | floor_area_sqm | 101163.786164 | 101163.786164 | #66ff8c |
| 3 | flat_model_DBSS | 96392.406467 | 96392.406467 | #66ff8c |
| 4 | town_CHOA CHU KANG | -92325.547371 | 92325.547371 | #ff8c66 |
| 5 | town_SENGKANG | -87261.418942 | 87261.418942 | #ff8c66 |
| 6 | town_SEMBAWANG | -81955.302185 | 81955.302185 | #ff8c66 |
| 7 | town_BISHAN | 66222.003068 | 66222.003068 | #66ff8c |
| 8 | town_WOODLANDS | -65884.191569 | 65884.191569 | #ff8c66 |
| 9 | remaining_lease | 62379.280902 | 62379.280902 | #66ff8c |
| 10 | town_HOUGANG | -61281.877000 | 61281.877000 | #ff8c66 |
| 11 | town_BUKIT PANJANG | -58715.832864 | 58715.832864 | #ff8c66 |
| 12 | town_QUEENSTOWN | 57554.300906 | 57554.300906 | #66ff8c |
| 13 | distance_to_cbd_km | -48683.240896 | 48683.240896 | #ff8c66 |
| 14 | town_JURONG WEST | -39008.384206 | 39008.384206 | #ff8c66 |
| 15 | flat_model_Maisonette | 37787.053763 | 37787.053763 | #66ff8c |
| 16 | town_BUKIT MERAH | 34885.110961 | 34885.110961 | #66ff8c |
| 17 | town_BUKIT BATOK | -34476.244802 | 34476.244802 | #ff8c66 |
| 18 | town_PUNGGOL | -33358.040305 | 33358.040305 | #ff8c66 |
| 19 | flat_model_Model A | -33277.231920 | 33277.231920 | #ff8c66 |
| 20 | town_PASIR RIS | -31946.388680 | 31946.388680 | #ff8c66 |
| 21 | flat_model_Improved | -30572.832746 | 30572.832746 | #ff8c66 |
| 22 | town_YISHUN | -30341.511250 | 30341.511250 | #ff8c66 |
| 23 | town_CLEMENTI | 29971.824165 | 29971.824165 | #66ff8c |
| 24 | flat_model_Model A2 | -29110.214431 | 29110.214431 | #ff8c66 |
| 25 | shortest_dist_mrt | -23269.491895 | 23269.491895 | #ff8c66 |
| 26 | flat_model_Premium Apartment | -20651.830504 | 20651.830504 | #ff8c66 |
| 27 | town_CENTRAL AREA | -20317.722650 | 20317.722650 | #ff8c66 |
| 28 | flat_model_New Generation | -18469.235664 | 18469.235664 | #ff8c66 |
| 29 | town_KALLANG/WHAMPOA | -14446.040576 | 14446.040576 | #ff8c66 |
| 30 | storey_range | 14074.526464 | 14074.526464 | #66ff8c |
| 31 | flat_model_Standard | -14021.056507 | 14021.056507 | #ff8c66 |
| 32 | town_GEYLANG | -11546.867920 | 11546.867920 | #ff8c66 |
| 33 | town_JURONG EAST | -10684.563223 | 10684.563223 | #ff8c66 |
| 34 | flat_model_Simplified | -10529.464756 | 10529.464756 | #ff8c66 |
| 35 | town_BEDOK | 10422.200086 | 10422.200086 | #66ff8c |
| 36 | shortest_dist_mall | -9734.985256 | 9734.985256 | #ff8c66 |
| 37 | town_TOA PAYOH | 9374.820768 | 9374.820768 | #66ff8c |
| 38 | school_shortest_dist | 5578.299926 | 5578.299926 | #66ff8c |
| 39 | town_TAMPINES | -3249.194358 | 3249.194358 | #ff8c66 |
| 40 | good_sch_score_sap_gep | 1165.568196 | 1165.568196 | #66ff8c |
| 41 | town_SERANGOON | 235.824063 | 235.824063 | #66ff8c |
fig, ax = plt.subplots(figsize=(10,10))
graph = ax.barh(lr_2_importance['Features'], lr_2_importance['Coeff_ABS'],
color=lr_2_importance['color'])
colors = {'Positive':'#66ff8c', 'Negative':'#ff8c66'}
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
plt.legend(handles, labels, title='Relationship',loc='lower right', fontsize ='10')
plt.title('Linear Regression Feature importance (good_sch_score_SAP_GEP)', size = 15)
plt.ylabel('Features', size = 12)
plt.xlabel('Coefficient', size = 12)
ax.bar_label(graph, fmt='%.0f', padding =10)
ax.set_xlim(right=150000)
ax.invert_yaxis()
plt.show()
We observe a positive coefficient for good_sch_score_sap_gep, which makes sense intuitively, as being near a good school would be associated with higher hdb resale prices.
Running a default parameter Random Forest Model.
rf_2 = RandomForestRegressor(random_state = 100)
rf_2_r2, rf_2_mae, rf_2_rmse = model_performance(rf_2, X2_train, X2_test, y2_train, y2_test)
rf_2_importance = pd.DataFrame(list(zip(rf_2.feature_names_in_, rf_2.feature_importances_)), columns =['Features', 'Coeff'])
rf_2_importance = rf_2_importance.sort_values(by = 'Coeff', ascending = False).reset_index(drop =True)
rf_2_importance
R^2 score = 0.9654625455113067 MAE: 19907.447739693638 RMSE 27761.688623186787
| Features | Coeff | |
|---|---|---|
| 0 | floor_area_sqm | 0.492021 |
| 1 | distance_to_cbd_km | 0.244590 |
| 2 | remaining_lease | 0.116498 |
| 3 | storey_range | 0.040999 |
| 4 | shortest_dist_mrt | 0.027000 |
| 5 | shortest_dist_mall | 0.019661 |
| 6 | school_shortest_dist | 0.009687 |
| 7 | flat_model_DBSS | 0.008312 |
| 8 | flat_model_Model A | 0.004973 |
| 9 | town_BUKIT TIMAH | 0.004021 |
| 10 | good_sch_score_sap_gep | 0.003456 |
| 11 | flat_model_Improved | 0.003264 |
| 12 | town_TAMPINES | 0.002669 |
| 13 | town_HOUGANG | 0.002394 |
| 14 | town_BISHAN | 0.002073 |
| 15 | town_JURONG WEST | 0.001733 |
| 16 | town_CLEMENTI | 0.001713 |
| 17 | town_SENGKANG | 0.001387 |
| 18 | town_PUNGGOL | 0.001247 |
| 19 | town_BUKIT PANJANG | 0.001161 |
| 20 | flat_model_Premium Apartment | 0.000997 |
| 21 | town_PASIR RIS | 0.000920 |
| 22 | town_BEDOK | 0.000885 |
| 23 | town_GEYLANG | 0.000858 |
| 24 | town_BUKIT BATOK | 0.000854 |
| 25 | town_BUKIT MERAH | 0.000834 |
| 26 | town_MARINE PARADE | 0.000823 |
| 27 | town_QUEENSTOWN | 0.000770 |
| 28 | town_JURONG EAST | 0.000734 |
| 29 | town_KALLANG/WHAMPOA | 0.000440 |
| 30 | town_CHOA CHU KANG | 0.000436 |
| 31 | flat_model_New Generation | 0.000360 |
| 32 | town_YISHUN | 0.000323 |
| 33 | flat_model_Maisonette | 0.000305 |
| 34 | flat_model_Standard | 0.000292 |
| 35 | town_TOA PAYOH | 0.000257 |
| 36 | town_WOODLANDS | 0.000227 |
| 37 | flat_model_Simplified | 0.000226 |
| 38 | town_CENTRAL AREA | 0.000211 |
| 39 | town_SERANGOON | 0.000201 |
| 40 | town_SEMBAWANG | 0.000103 |
| 41 | flat_model_Model A2 | 0.000086 |
fig, ax = plt.subplots(figsize=(10,10))
rf_2_importance_graph = rf_2_importance.head(20)
graph = ax.barh(rf_2_importance_graph['Features'], rf_2_importance_graph['Coeff'])
plt.title('Random Forest Feature importance (good_sch_score_SAP_GEP)', size = 15)
plt.annotate(('Test R\u00b2: ' + str(round(rf_2_r2,3)) + '\nTest MAE: ' + str(round(rf_2_mae))+
'\nTest RMSE: ' + str(round(rf_2_rmse))),
xy=(0.4,18), fontsize = 12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.bar_label(graph, fmt='%.5f', padding =10)
ax.set_xlim(right=0.6)
ax.invert_yaxis()
plt.show()
We observe no significant differ in feature importance from changing good_sch_score to good_sch_score_sap_gep.
dropped_features_2 = ['mrt_count_within_1km','mall_count_within_1km',
'matured','good_sch_count_within_1km','floor_area_sqm','flat_type',
'good_sch_score', 'year', 'distance_to_cbd_km', 'remaining_lease']
df_2 = data.drop(columns = dropped_features_2)
# Split target variable from dataset
y_2= df_2['adjusted_price']
X_2 = df_2.drop(columns='adjusted_price')
X2_train,X2_test,y2_train,y2_test=train_test_split(X_2,y_2,test_size=0.2,random_state=100)
# Scale dataset
continuous_features = ['shortest_dist_mrt',
'shortest_dist_mall', 'school_shortest_dist']
scaler = StandardScaler()
X2_train_continuous = X2_train[continuous_features]
X2_train_continuous = scaler.fit_transform(X2_train_continuous)
X2_test_continuous = X2_test[continuous_features]
X2_test_continuous = scaler.transform(X2_test_continuous)
# Replace original continuous features with scaled ones in X
X2_train[continuous_features] = X2_train_continuous
X2_test[continuous_features] = X2_test_continuous
Running Linear Regression
lr3 = LinearRegression()
lin_reg_3_r2, lin_reg_3_mae, lin_reg_3_rmse = model_performance(lr3, X2_train, X2_test, y2_train, y2_test)
lr3_importance = pd.DataFrame(list(zip(lr3.feature_names_in_, lr3.coef_)), columns =['Features', 'Coeff'])
lr3_importance.sort_values(by = 'Coeff', ascending = False)
R^2 score = 0.5189909682843437 MAE: 78505.61840751792 RMSE 103532.14051944068
| Features | Coeff | |
|---|---|---|
| 10 | town_BUKIT TIMAH | 160246.541525 |
| 6 | town_BISHAN | 120322.238742 |
| 19 | town_MARINE PARADE | 103211.609514 |
| 8 | town_BUKIT MERAH | 78841.387845 |
| 32 | flat_model_Maisonette | 45504.043389 |
| 25 | town_SERANGOON | 31814.260142 |
| 22 | town_QUEENSTOWN | 28962.004743 |
| 13 | town_CLEMENTI | 23730.519842 |
| 0 | storey_range | 21719.818110 |
| 4 | good_sch_score_sap_gep | 3908.884211 |
| 30 | flat_model_DBSS | 3498.705413 |
| 3 | school_shortest_dist | 1400.129841 |
| 18 | town_KALLANG/WHAMPOA | -2299.867659 |
| 5 | town_BEDOK | -7330.595225 |
| 2 | shortest_dist_mall | -8760.642504 |
| 20 | town_PASIR RIS | -13105.636339 |
| 26 | town_TAMPINES | -15267.153911 |
| 1 | shortest_dist_mrt | -22156.036553 |
| 21 | town_PUNGGOL | -22973.947558 |
| 14 | town_GEYLANG | -23902.851483 |
| 16 | town_JURONG EAST | -30402.426937 |
| 27 | town_TOA PAYOH | -30615.310208 |
| 15 | town_HOUGANG | -36032.332666 |
| 24 | town_SENGKANG | -57485.900833 |
| 7 | town_BUKIT BATOK | -61726.696238 |
| 9 | town_BUKIT PANJANG | -62976.919705 |
| 29 | town_YISHUN | -66774.743612 |
| 11 | town_CENTRAL AREA | -75833.892388 |
| 12 | town_CHOA CHU KANG | -93253.350526 |
| 28 | town_WOODLANDS | -94331.864853 |
| 17 | town_JURONG WEST | -97552.827875 |
| 23 | town_SEMBAWANG | -114511.421800 |
| 36 | flat_model_Premium Apartment | -148523.052135 |
| 31 | flat_model_Improved | -195578.048569 |
| 33 | flat_model_Model A | -209345.145233 |
| 34 | flat_model_Model A2 | -242595.430405 |
| 37 | flat_model_Simplified | -289924.945252 |
| 38 | flat_model_Standard | -297104.431044 |
| 35 | flat_model_New Generation | -313787.153975 |
lr3_importance['Coeff_ABS'] = np.abs(lr3_importance['Coeff'])
lr3_importance = lr3_importance.sort_values('Coeff_ABS', ascending = False).reset_index(drop= True)
lr3_importance['color'] = lr3_importance['Coeff'].apply(lambda x: '#66ff8c' if x>0 else '#ff8c66')
lr3_importance
| Features | Coeff | Coeff_ABS | color | |
|---|---|---|---|---|
| 0 | flat_model_New Generation | -313787.153975 | 313787.153975 | #ff8c66 |
| 1 | flat_model_Standard | -297104.431044 | 297104.431044 | #ff8c66 |
| 2 | flat_model_Simplified | -289924.945252 | 289924.945252 | #ff8c66 |
| 3 | flat_model_Model A2 | -242595.430405 | 242595.430405 | #ff8c66 |
| 4 | flat_model_Model A | -209345.145233 | 209345.145233 | #ff8c66 |
| 5 | flat_model_Improved | -195578.048569 | 195578.048569 | #ff8c66 |
| 6 | town_BUKIT TIMAH | 160246.541525 | 160246.541525 | #66ff8c |
| 7 | flat_model_Premium Apartment | -148523.052135 | 148523.052135 | #ff8c66 |
| 8 | town_BISHAN | 120322.238742 | 120322.238742 | #66ff8c |
| 9 | town_SEMBAWANG | -114511.421800 | 114511.421800 | #ff8c66 |
| 10 | town_MARINE PARADE | 103211.609514 | 103211.609514 | #66ff8c |
| 11 | town_JURONG WEST | -97552.827875 | 97552.827875 | #ff8c66 |
| 12 | town_WOODLANDS | -94331.864853 | 94331.864853 | #ff8c66 |
| 13 | town_CHOA CHU KANG | -93253.350526 | 93253.350526 | #ff8c66 |
| 14 | town_BUKIT MERAH | 78841.387845 | 78841.387845 | #66ff8c |
| 15 | town_CENTRAL AREA | -75833.892388 | 75833.892388 | #ff8c66 |
| 16 | town_YISHUN | -66774.743612 | 66774.743612 | #ff8c66 |
| 17 | town_BUKIT PANJANG | -62976.919705 | 62976.919705 | #ff8c66 |
| 18 | town_BUKIT BATOK | -61726.696238 | 61726.696238 | #ff8c66 |
| 19 | town_SENGKANG | -57485.900833 | 57485.900833 | #ff8c66 |
| 20 | flat_model_Maisonette | 45504.043389 | 45504.043389 | #66ff8c |
| 21 | town_HOUGANG | -36032.332666 | 36032.332666 | #ff8c66 |
| 22 | town_SERANGOON | 31814.260142 | 31814.260142 | #66ff8c |
| 23 | town_TOA PAYOH | -30615.310208 | 30615.310208 | #ff8c66 |
| 24 | town_JURONG EAST | -30402.426937 | 30402.426937 | #ff8c66 |
| 25 | town_QUEENSTOWN | 28962.004743 | 28962.004743 | #66ff8c |
| 26 | town_GEYLANG | -23902.851483 | 23902.851483 | #ff8c66 |
| 27 | town_CLEMENTI | 23730.519842 | 23730.519842 | #66ff8c |
| 28 | town_PUNGGOL | -22973.947558 | 22973.947558 | #ff8c66 |
| 29 | shortest_dist_mrt | -22156.036553 | 22156.036553 | #ff8c66 |
| 30 | storey_range | 21719.818110 | 21719.818110 | #66ff8c |
| 31 | town_TAMPINES | -15267.153911 | 15267.153911 | #ff8c66 |
| 32 | town_PASIR RIS | -13105.636339 | 13105.636339 | #ff8c66 |
| 33 | shortest_dist_mall | -8760.642504 | 8760.642504 | #ff8c66 |
| 34 | town_BEDOK | -7330.595225 | 7330.595225 | #ff8c66 |
| 35 | good_sch_score_sap_gep | 3908.884211 | 3908.884211 | #66ff8c |
| 36 | flat_model_DBSS | 3498.705413 | 3498.705413 | #66ff8c |
| 37 | town_KALLANG/WHAMPOA | -2299.867659 | 2299.867659 | #ff8c66 |
| 38 | school_shortest_dist | 1400.129841 | 1400.129841 | #66ff8c |
fig, ax = plt.subplots(figsize=(10,10))
graph = ax.barh(lr3_importance['Features'], lr3_importance['Coeff_ABS'],
color=lr3_importance['color'])
colors = {'Positive':'#66ff8c', 'Negative':'#ff8c66'}
labels = list(colors.keys())
handles = [plt.Rectangle((0,0),1,1, color=colors[label]) for label in labels]
plt.legend(handles, labels, title='Relationship',loc='lower right', fontsize ='10')
plt.title('Linear Regression Feature importance (Other Analysis) 2013-2023', size = 15)
plt.ylabel('Features', size = 12)
plt.xlabel('Coefficient', size = 12)
ax.bar_label(graph, fmt='%.0f', padding =10)
ax.set_xlim(right=350000)
ax.invert_yaxis()
plt.show()
Running Default Parameter Random Forest
rf_3 = RandomForestRegressor(random_state = 100)
rf_3_r2, rf_3_mae, rf_3_rmse = model_performance(rf_3, X2_train, X2_test, y2_train, y2_test)
rf_3_importance = pd.DataFrame(list(zip(rf_3.feature_names_in_, rf_3.feature_importances_)), columns =['Features', 'Coeff'])
rf_3_importance = rf_3_importance.sort_values(by = 'Coeff', ascending = False).reset_index(drop =True)
rf_3_importance
R^2 score = 0.8521889572083702 MAE: 42414.281049437544 RMSE 57392.08704989647
| Features | Coeff | |
|---|---|---|
| 0 | storey_range | 0.142879 |
| 1 | shortest_dist_mrt | 0.129558 |
| 2 | flat_model_Maisonette | 0.096262 |
| 3 | shortest_dist_mall | 0.096108 |
| 4 | school_shortest_dist | 0.091538 |
| 5 | flat_model_New Generation | 0.059715 |
| 6 | good_sch_score_sap_gep | 0.032448 |
| 7 | town_BUKIT MERAH | 0.031518 |
| 8 | flat_model_Premium Apartment | 0.028193 |
| 9 | flat_model_DBSS | 0.027421 |
| 10 | town_BISHAN | 0.022764 |
| 11 | town_QUEENSTOWN | 0.021478 |
| 12 | flat_model_Model A | 0.020548 |
| 13 | flat_model_Improved | 0.018232 |
| 14 | flat_model_Simplified | 0.017427 |
| 15 | flat_model_Standard | 0.015902 |
| 16 | flat_model_Model A2 | 0.013429 |
| 17 | town_MARINE PARADE | 0.010216 |
| 18 | town_GEYLANG | 0.010107 |
| 19 | town_BEDOK | 0.010079 |
| 20 | town_TOA PAYOH | 0.009998 |
| 21 | town_TAMPINES | 0.009823 |
| 22 | town_JURONG WEST | 0.009782 |
| 23 | town_KALLANG/WHAMPOA | 0.009717 |
| 24 | town_SERANGOON | 0.007373 |
| 25 | town_WOODLANDS | 0.006338 |
| 26 | town_HOUGANG | 0.006316 |
| 27 | town_CLEMENTI | 0.006270 |
| 28 | town_CHOA CHU KANG | 0.006046 |
| 29 | town_BUKIT BATOK | 0.005258 |
| 30 | town_BUKIT TIMAH | 0.004209 |
| 31 | town_SEMBAWANG | 0.003687 |
| 32 | town_SENGKANG | 0.003677 |
| 33 | town_BUKIT PANJANG | 0.003662 |
| 34 | town_YISHUN | 0.002835 |
| 35 | town_PUNGGOL | 0.002720 |
| 36 | town_PASIR RIS | 0.002591 |
| 37 | town_CENTRAL AREA | 0.002092 |
| 38 | town_JURONG EAST | 0.001783 |
fig, ax = plt.subplots(figsize=(8,8))
rf_3_importance_graph = rf_3_importance.head(10)
graph = ax.barh(rf_3_importance_graph['Features'], rf_3_importance_graph['Coeff'])
plt.title('Random Forest Feature importance (Other analysis) 2013-2023', size = 15)
plt.annotate(('Test R\u00b2: ' + str(round(rf_3_r2,3)) + '\nTest MAE: ' + str(round(rf_3_mae))+
'\nTest RMSE: ' + str(round(rf_3_rmse))),
xy=(0.12,9), fontsize = 12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
ax.bar_label(graph, fmt='%.5f', padding =10)
ax.set_xlim(right=0.18)
ax.invert_yaxis()
plt.show()
import shap
model = RandomForestRegressor(random_state=100)
model.fit(X_train, y_train)
shap.initjs()
explainer = shap.TreeExplainer(model)
RandomForestRegressor(random_state=100)
Smallest Y Values
y_test = y_test.reset_index()
smallest = y_test['adjusted_price'].nsmallest(2).index
shap.initjs()
smallest_1 = smallest[0]
shap_values = explainer.shap_values(X_test.iloc[[smallest_1]])
shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[smallest_1]])
shap.initjs()
smallest_2 = smallest[1]
shap_values = explainer.shap_values(X_test.iloc[[smallest_2]])
shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[smallest_2]])
#shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[smallest_2]],show=False,matplotlib=True).savefig('low_2014.png')
Median Y Values
temp = y_test['adjusted_price']
# get the median value of the Series
median = temp.quantile(0.5)
# set the range around the median to include the two values closest to it
range_low = median - 0.5 * (temp.max() - temp.min()) / len(temp)
range_high = median + 0.5 * (temp.max() - temp.min()) / len(temp)
# use boolean indexing to get the indexes of the values that fall within the range around the median
median_indexes = temp[(temp >= range_low) & (temp <= range_high)].index
shap.initjs()
median_1 = median_indexes[0]
shap_values = explainer.shap_values(X_test.iloc[[median_1]])
#shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[median_1]])
#shap.force_plot(explainer.expected_value[0], shap_values[0], np.round(X_test.iloc[[median_1]],4),show=False,matplotlib=True).savefig('med_2014.png')
shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[median_1]])
shap.initjs()
median_2 = median_indexes[1]
shap_values = explainer.shap_values(X_test.iloc[[median_2]])
#shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[median_2]],show=False,matplotlib=True).savefig('med_2014.png')
shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[median_2]])
#shap.force_plot(explainer.expected_value[0], shap_values[0], np.round(X_test.iloc[[median_2]],4),show=False,matplotlib=True).savefig('med_2014.png')
Largest Y Values
largest = y_test['adjusted_price'].nlargest(2).index
shap.initjs()
largest_1 = largest[0]
shap_values = explainer.shap_values(X_test.iloc[[largest_1]])
shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[largest_1]])
shap.initjs()
largest_2 = largest[1]
shap_values = explainer.shap_values(X_test.iloc[[largest_2]])
shap.force_plot(explainer.expected_value[0], shap_values[0], X_test.iloc[[largest_2]])
shap.summary_plot(shap_values, X_test, plot_type='bar', plot_size = (8,5))